## R version 3.3.3 (2017-03-06)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Debian GNU/Linux 8 (jessie)
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## loaded via a namespace (and not attached):
## [1] backports_1.1.2 magrittr_1.5 rprojroot_1.3-2 tools_3.3.3
## [5] htmltools_0.3.6 Rcpp_0.12.15 stringi_1.1.6 rmarkdown_1.8
## [9] knitr_1.20 stringr_1.3.0 digest_0.6.15 evaluate_0.10.1
Our Apache Yarn cluster hosts the flights data, representing 123 million flights over 22 years. Read the lecture notes on how to access the Yarn cluster. Connect to the database using sparklyr and answer the following questions. You can base your answers on a specific year or on the whole data set.
#Map the top 10 busiest airports. Size of dots should reflect the number of
#flights through that destination.
# Determine the n count of flight origins
# Count 1991 departures per origin airport on the Spark side, then pull the
# 15 largest counts into R. NOTE(review): 15 rows (not 10) are kept,
# presumably to leave slack for the inner join with the destination counts
# further down -- confirm that intent.
originCount <- flights_tbl %>%
filter(year == 1991) %>%
group_by(origin) %>%
tally() %>%
arrange(desc(n)) %>%
collect() %>%
head(15)
# Determine the n count of flight destinations
# Same tally as the origin count above, keyed by destination airport.
departureCount <- flights_tbl %>%
filter(year == 1991) %>%
group_by(dest) %>%
tally() %>%
arrange(desc(n)) %>%
collect() %>%
head(15)
# Label the tally columns so the join below is unambiguous.
names(originCount) <- c("origin", "outgoing")
names(departureCount) <- c("destination", "incoming")
# Determine the "busiest" airports by summing incoming and outgoing flights;
# only airports appearing in both top-15 lists survive the inner join.
busiest <- inner_join(originCount, departureCount,
                      by = c("origin" = "destination"))
flightCount <- busiest %>%
  transmute(airport = origin, totalFlights = incoming + outgoing) %>%
  arrange(desc(totalFlights)) %>%
  head(10)
# Location Retrieval.
# Attach coordinates from the airports table to the top-10 list. lat/lon
# are cast to double (presumably stored as character in airports_tbl --
# confirm), and each airport's share of the total flights is computed.
topTenLocations <- flightCount %>%
inner_join(airports_tbl, by = c("airport" = "faa"), copy = TRUE) %>%
select(airport, totalFlights, lon, lat) %>%
mutate(lat = as.double(lat),
lon = as.double(lon),
proportion = totalFlights/sum(totalFlights)) %>%
collect()
# Create compressed RDS file.
#library(tidyverse)
##write_rds(topTenLocations, "hw4/busiestAirports.rds")
# Create a plot of the US MAP.
# usmap/states are base-map polygon tables; only `map` (the Google tiles)
# is used for this particular plot.
usmap <- map_data("usa")
states <- map_data("state")
map <- get_map(location = "united states", zoom = 4)
## Map from URL : http://maps.googleapis.com/maps/api/staticmap?center=united+states&zoom=4&size=640x640&scale=2&maptype=terrain&language=en-EN&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=united%20states&sensor=false
# Dot size encodes total flights; geom_label_repel keeps the airport codes
# from overlapping the dots.
busyUSports <- ggmap(map) +
geom_point(data = topTenLocations,
aes(x = lon, y = lat, size = totalFlights),
color = "black") +
geom_label_repel(data = topTenLocations,
aes(x= lon, y = lat, label = airport
), min.segment.length = 0, size = 3.5) +
ggtitle("Busiest US Airports in 1991")
busyUSports
#Map the top 10 busiest direct routes.
#Size of lines should reflect the number of flights through that route.
# Count 2002 flights per (origin, dest) pair on the Spark side and keep the
# ten busiest routes locally.
topRoutes <-flights_tbl %>%
filter(year == 2002) %>%
group_by(origin, dest) %>%
summarize(n = n()) %>%
arrange(desc(n)) %>%
collect() %>%
head(10)
# Retrieve latitude and longitude of origin and dest
# Join airport coordinates onto the top routes, once keyed by origin and
# once by destination; lat/lon are cast from their stored representation
# to double.
topTenOrigin<- topRoutes %>%
inner_join(airports_tbl, by = c("origin" = "faa"), copy = TRUE) %>%
select(origin, lat, lon, name,n) %>%
mutate(latO = as.double(lat),
lonO = as.double(lon)) %>%
collect()
# Destination coordinates for the same ten routes.
topTenDest <- topRoutes %>%
inner_join(airports_tbl, by = c("dest" = "faa"), copy = TRUE) %>%
select(origin, lat, lon, name,n) %>%
mutate(latD = as.double(lat),
lonD = as.double(lon)) %>%
collect()
# Pair each route's origin and destination coordinates side by side.
# NOTE(review): bind_cols() assumes both join results preserved the row
# order of topRoutes; a join pushed through SQL is not guaranteed to do
# so -- verify, or join both coordinate sets onto topRoutes by key instead.
q2 <- bind_cols(topTenOrigin, topTenDest)
# Line-width factor: the busiest route gets 1, the rest shrink sharply
# because of the 10th power.
q2 <- q2 %>%
mutate(scale = (n / max(n)) ^ 10)
# For the labels, we want to remove duplicate origins so that it's not cluttered: 3 different ways to do so:
# (1) keep the first row per origin group,
q2Sliced <- q2 %>%
group_by(origin) %>%
filter(row_number() == 1)
# (2)/(3) hard-coded row picks (rows 1-6 and 8) -- valid only for this
# particular data ordering.
q2Sliced2 <- q2 %>%
.[c(1:6, 8),]
q2Sliced3 <- q2 %>%
slice(c(1:6,8))
### google map
# Refetch the US base tiles for the routes overlay.
map <- get_map(location = "united states", zoom = 4)
## Map from URL : http://maps.googleapis.com/maps/api/staticmap?center=united+states&zoom=4&size=640x640&scale=2&maptype=terrain&language=en-EN&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=united%20states&sensor=false
# Red dots mark both route endpoints; curves run origin -> destination with
# the arrowhead drawn at the origin end (ends = "first").
ggmap(map) +
geom_point(data = q2,
aes(x = lonO, y = latO),
colour = "red", alpha = 0.2) +
geom_point(data = q2,
aes(x = lonD, y = latD),
colour = "red", alpha = 0.2)+
geom_curve(data = q2,
aes(x = lonO, y = latO, xend = lonD, yend = latD),
arrow = arrow(angle = 15, ends = "first", length = unit(0.2, "cm"), type = "closed"),
alpha = 0.7, curvature = 0.15, inherit.aes = TRUE) +
scale_size_continuous(range=c(1,30)) +
coord_cartesian()
#### simple map
# State-outline version of the routes map (no Google tiles needed).
unitedStates <- ggplot(data = states) +
  geom_polygon(aes(x = long, y = lat, fill = region, group = group), color = "white") +
  coord_fixed(1.3) +
  # Route endpoints, semi-transparent so overlapping hubs stay readable.
  geom_point(data = q2,
             aes(x = lonO, y = latO),
             colour = "red", alpha = 0.2) +
  geom_point(data = q2,
             aes(x = lonD, y = latD),
             colour = "red", alpha = 0.2) +
  # Curves with the arrowhead at the origin end (ends = "first"); line
  # width is scaled per route outside aes() so no extra legend is created.
  geom_curve(data = q2,
             aes(x = lonO, y = latO, xend = lonD, yend = latD),
             arrow = arrow(angle = 15, ends = "first", length = unit(0.5, "cm"), type = "closed"),
             alpha = 0.7, curvature = 0.15, size = q2$scale, inherit.aes = TRUE) +
  scale_size_continuous(range = c(1, 30)) +
  coord_cartesian() +
  guides(fill = FALSE) +
  # Fix: map the label from the data column instead of reaching into the
  # data frame with `$` (which bypasses ggplot's data handling and breaks
  # under faceting), and drop the stray `size = 2` aesthetic mapping that
  # the constant `size = 3.5` overrode anyway.
  geom_label_repel(data = q2Sliced,
                   aes(x = lonO, y = latO, label = origin),
                   min.segment.length = 0, size = 3.5) +
  ggtitle("Busiest US Routes in 2002")
unitedStates
LAX:
(a). Reproduce the above plot. Visualize and explain some prominent features you observe. For example, what happened at points 1-5?
#(a). Reproduce above plot. Visualize and explain some prominent features you observe. For example, what happened at points 1-5?
# Pull every flight touching LAX (either direction) with its date parts.
# NOTE(review): this collects all years before the date filter below;
# pushing the 1998-2008 restriction into Spark first would move far less
# data -- confirm before scaling up.
laxFlights <-flights_tbl %>%
filter(origin == "LAX" | dest == "LAX") %>%
select(origin, dayofmonth, month, year) %>%
collect()
#Subset Between 2008 and 1998 Years of LAX flights.
# Build a Date column and keep 1998-01-01 through 2008-12-31. collect() is
# a no-op here since the data is already local.
laxFlightDate <- laxFlights %>%
mutate(date = make_date(year, month, dayofmonth)) %>%
filter(date >= ymd(19980101) & date <= ymd(20081231)) %>%
collect()
# Create compressed RDS file.
#write_rds(laxFlightDate, "hw4/laxflightdates.rds")
# Plot
# Daily LAX flight counts over the whole window, with numbered labels
# marking the features examined below. NOTE(review): coord_fixed() forces
# a 1:1 unit aspect ratio between days (x) and counts (y) -- confirm this
# is intended, as it typically squashes a decade-long time series.
laxFlightDate %>%
group_by(date) %>%
count() %>%
ggplot(aes(x = date, y = n)) +
geom_line() +
geom_label(aes(x = ymd(20011230), y = 1110, label = "1")) +
geom_label(aes(x = ymd(20041201), y = 1000, label = "2")) +
geom_label(aes(x = ymd(20040630), y = 950, label = "3")) +
geom_label(aes(x = ymd(20080101), y = 1300, label = "4")) +
geom_label(aes(x = ymd(20010130), y = 1150, label = "5")) +
ggtitle("LAX air traffic") +
coord_fixed()
# Visualize prominent features.
# Helper: plot daily LAX flight counts inside one date window.
# The original code repeated this pipeline five times with redundant
# group_by(date) calls, a no-op collect() on already-local data, and an
# arrange(n) that had no effect on the plot (geom_line orders by the x
# value itself); count(date) alone yields the per-day totals.
plotDateWindow <- function(data, from, to, title) {
  data %>%
    filter(date >= ymd(from) & date <= ymd(to)) %>%
    count(date) %>%
    ggplot(aes(x = date, y = n)) +
    geom_line() +
    ggtitle(title)
}
# Point 1: 2001-09-01 through 2001-12-01.
plotDateWindow(laxFlightDate, 20010901, 20011201, "Point 1")
# Point 2: 2004-07-31 through 2005-01-01.
plotDateWindow(laxFlightDate, 20040731, 20050101, "Point 2")
# Point 3: 2004-03-01 through 2004-11-01.
plotDateWindow(laxFlightDate, 20040301, 20041101, "Point 3")
# Point 4: 2007-09-28 through 2008-01-30.
plotDateWindow(laxFlightDate, 20070928, 20080130, "Point 4")
# Point 5: 2000-11-15 through 2001-02-25.
plotDateWindow(laxFlightDate, 20001115, 20010225, "Point 5")
(b). Visualize and explain seasonal effects.
# Count LAX flights per (year, season) between 1998 and 2008.
# Fix: use the vectorized `&` in filter() -- the scalar `&&` is not a
# row-wise operator, so the year restriction was not applied per row.
flightsBySeason <- flights_tbl %>%
  select(origin, dest, month, year) %>%
  filter(year >= 1998 & year <= 2008) %>%
  filter(dest == "LAX" | origin == "LAX") %>%
  # Bucket months into meteorological seasons (Dec-Feb = winter, etc.).
  mutate(season = if_else(month %in% c(3, 4, 5), "spring",
                  if_else(month %in% c(6, 7, 8), "summer",
                  if_else(month %in% c(9, 10, 11), "fall",
                  if_else(month %in% c(12, 1, 2), "winter", ""))))) %>%
  count(year, season) %>%
  arrange(year) %>%
  collect()
# Stacked bars: one bar per year, filled by season.
ggplot(flightsBySeason) +
  geom_col(aes(x = as.factor(year), y = n, fill = season),
           color = "black") +
  labs(x = "Year", y = "Number of Flights", title = "LAX Air Traffic by Season") +
  scale_fill_brewer(palette = "Pastel2")
(c). Visualize and explain weekly effects.
# Count LAX flights per (year, dayofweek) between 1998 and 2008.
# Fix: use the vectorized `&` in filter() -- the scalar `&&` is not a
# row-wise operator, so the year restriction was not applied per row.
flightsByWeekday <- flights_tbl %>%
  select(origin, dest, year, dayofweek) %>%
  filter(year >= 1998 & year <= 2008) %>%
  filter(dest == "LAX" | origin == "LAX") %>%
  count(year, dayofweek) %>%
  collect()
# The discrete fill scale needs a factor, not a number.
flightsByWeekday$dayofweek <- as.factor(flightsByWeekday$dayofweek)
ggplot(flightsByWeekday) +
  geom_col(aes(x = as.factor(year), y = n, fill = dayofweek),
           color = "black") +
  labs(x = "Year", y = "Number of Flights", title = "LAX Air Traffic by Day of Week") +
  scale_fill_brewer(palette = "Pastel2")
(d). Map top 10 destinations from LAX. Size of dots should reflect the number of flights from LAX to that destination.
# Filter the routes by destination: LAX.
# Fix: aggregate on the Spark side and collect only the 10 summary rows,
# instead of pulling every LAX departure across all years into R before
# counting. (Note this reuses/overwrites the earlier `topRoutes` name.)
topRoutes <- flights_tbl %>%
  select(origin, dest) %>%
  filter(origin == "LAX") %>%
  count(dest) %>%
  arrange(desc(n)) %>%
  head(10) %>%
  collect()
# Location Retrieval.
# Attach coordinates for the ten destinations; left_join keeps a route even
# if its airport code is missing from airports_tbl (lat/lon become NA).
# copy = TRUE lets the join run even though the two tables live in
# different sources.
destLAX <- topRoutes %>%
left_join(airports_tbl, by = c("dest" = "faa"), copy = TRUE) %>%
select(dest, lon, lat, n) %>%
mutate(lat = as.double(lat),
lon = as.double(lon)
) %>%
mutate(airport = dest, proportion = n / sum(n))
# Dot size encodes each destination's share of LAX departures.
# NOTE(review): `map` is whichever get_map() result ran last in this
# script -- confirm the intended base map is still in scope here.
ggmap(map) +
geom_point(data = destLAX,
aes(x = lon, y = lat, size = proportion),
color = "black") +
geom_label_repel(data = destLAX,
aes(x= lon, y = lat, label = dest, size = proportion
), min.segment.length = 0, size = 3.5) +
labs(title = "Top 10 Locations from LAX")
(e). Build a predictive model for the arrival delay (arrdelay) of flights flying from LAX. Use the same filtering criteria as in the lecture notes to construct training and validation sets. You are allowed to use a maximum of 5 predictors. The prediction performance of your model on the validation data set will be an important factor for grading this question.
# Select data from an out of time sample
# Training pool: LAX departures 2003-2007, with non-missing delays and
# distance, departure delay in (15, 240) minutes and arrival delay in
# (-60, 360) minutes. The pipeline stays lazy (no collect), so
# system.time measures only query construction, not execution -- hence
# the milliseconds below.
system.time(
modelFlights <- flights_tbl %>%
filter(origin == "LAX") %>%
filter(!is.na(arrdelay) & !is.na(depdelay) & !is.na(distance)) %>%
filter(depdelay > 15 & depdelay < 240) %>%
filter(arrdelay > -60 & arrdelay < 360) %>%
filter(year >= 2003 & year <= 2007) %>%
mutate(weekDay = as.character(dayofweek)) %>%
select(year, month, arrdelay, depdelay, distance, weekDay)
)
## user system elapsed
## 0.004 0.000 0.006
# 80/20 train/validation split with a fixed seed, then fit a linear model
# for arrival delay on distance, departure delay and day of week. The
# character weekDay column is expanded into dummy coefficients in the
# summary below (one level is absorbed into the intercept).
sampleSplit <- modelFlights %>%
sdf_partition(train = 0.8, valid = 0.2, seed= 250)
mlr <- sampleSplit$train %>%
ml_linear_regression(arrdelay ~ distance + depdelay + weekDay)
summary(mlr)
## Call: ml_linear_regression.tbl_spark(., arrdelay ~ distance + depdelay + weekDay)
##
## Deviance Residuals (approximate):
## Min 1Q Median 3Q Max
## -193.070 -7.181 -1.515 5.246 334.094
##
## Coefficients:
## (Intercept) distance depdelay weekDay_5 weekDay_7 weekDay_4
## -1.55690020 -0.00176854 1.01481246 0.62609083 0.35561858 0.85893367
## weekDay_1 weekDay_3 weekDay_6
## 0.67183093 0.52893317 -1.22533557
##
## R-Squared: 0.8828
## Root Mean Squared Error: 13.48
# Apply MLR to LAX flight data in 2008 (out-of-time validation year; the
# original comment said "2018", but the code filters year == 2008).
# The filters deliberately mirror the training pipeline above.
system.time(
LAX2008 <-flights_tbl %>%
filter(origin == "LAX") %>%
filter(!is.na(arrdelay) & !is.na(depdelay) & !is.na(distance)) %>%
filter(depdelay > 15 & depdelay < 240) %>%
filter(arrdelay > -60 & arrdelay < 360) %>%
filter(year == 2008) %>%
mutate(weekDay = as.character(dayofweek)) %>%
select(year, month, arrdelay, depdelay, distance, weekDay)
)
## user system elapsed
## 0.004 0.000 0.027
#Determine prediction model.
# Score the 2008 hold-out with the fitted model and bring the squared
# residuals back locally. (NOTE: sdf_predict() is deprecated in newer
# sparklyr releases in favor of ml_predict(); kept here to match the
# environment this script was run under.)
predictionVal2008 <- sdf_predict(LAX2008, mlr) %>%
  mutate(squareRes = (prediction - arrdelay)^2) %>%
  collect()
# Calculate Root Mean Squared Error.
# Fix: the original divided by dplyr::count(), which returns a 1x1 tibble,
# so `rootmse` came out as a data frame (see the old "## n" echo). Using
# mean() of the squared residuals yields the same value as a plain number.
rootmse <- sqrt(mean(predictionVal2008$squareRes))
rootmse
## [1] 14.27216
# Visualize actual arrival delay against predicted arrival delay,
# aggregated to one point per weekday.
byWeekday <- predictionVal2008 %>% group_by(weekDay)
PredictionData <- summarize(byWeekday,
                            actual = mean(arrdelay),
                            prediction = mean(prediction),
                            freq = n())
# Plot: points on the 45-degree reference line indicate that the mean
# prediction matches the mean observed delay for that weekday.
calibrationPlot <- ggplot(PredictionData, aes(x = actual, y = prediction)) +
  geom_point(alpha = 0.75, color = 'red', shape = 3) +
  geom_abline(intercept = 0, slope = 1, alpha = 0.15, color = 'blue') +
  geom_text(aes(label = substr(weekDay, 1, 20)), size = 3, alpha = 0.75, vjust = -1) +
  labs(title='Actual arrival Delay on Predicted Delay', x = 'Actual', y = 'Predicted')
calibrationPlot
#Map the top 10 Origins to SFO in 1991.
# Count 1991 arrivals into SFO per origin airport (Spark-side) and keep
# the ten largest feeders locally.
topSF <-flights_tbl %>%
filter(year == 1991 & dest == "SFO") %>%
group_by(origin, dest) %>%
summarize(n = n()) %>%
arrange(desc(n)) %>%
collect() %>%
head(10)
# Retrieve latitude and longitude of origin and dest
# Origin coordinates for the ten feeder airports, cast to double.
topTenSF <- topSF %>%
inner_join(airports_tbl, by = c("origin" = "faa"), copy = TRUE) %>%
select(origin, dest, lat, lon, name,n) %>%
mutate(latO = as.double(lat),
lonO = as.double(lon)) %>%
collect()
# State-outline base map. NOTE(review): SFO's position is hard-coded below
# as (-122.375, 37.61899948120117) rather than looked up from airports_tbl
# -- confirm it stays in sync with the airports data.
usmap <- map_data("usa")
states <- map_data("state")
SFOflights <- ggplot(data = states) +
geom_polygon(aes(x = long, y = lat, fill = region, group = group), color = "white") +
coord_fixed(1.3) +
# Dot size encodes flight volume from each origin airport.
geom_point(data = topTenSF,
aes(x = lonO, y = latO, size = n),
colour = "red", alpha = 0.3) +
# One overplotted dot per row marks SFO itself.
geom_point(data = topTenSF,
aes(x = -122.375, y = 37.61899948120117),
colour = "red", alpha = 0.2)+
# Arrows point from each curve toward SFO.
geom_curve(data = topTenSF,
aes(x = lonO, y = latO, xend = -122.375, yend = 37.61899948120117),
arrow = arrow(angle = 15, length = unit(0.5, "cm"), type = "closed"),
alpha = 0.7, inherit.aes = TRUE) +
scale_size_continuous(range=c(1,30)) +
coord_cartesian() +
# Suppress the per-state fill legend.
guides(fill = FALSE) +
ggtitle("Top 10 Flight Origins to SFO in 1991")
SFOflights